knitr::opts_knit$set(root.dir = "~/Dropbox (EHA)/repositories/gains-summary") knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE) # Load required packages library(plyr) library(dplyr) library(magrittr)
# Load China data china_full <- read.csv("inst/rawdata/CHINA.csv", as.is = TRUE) keeps <- c(1:4, 7, 8, 14, 16:20, 23, 24, 28, 40, 49:52, 55, 56, 60, 137, 138, 139) china <- china_full[, keeps] names(china) # Load Kevin's data kevin <- read.csv("inst/rawdata/KevinP1rank.csv", as.is = TRUE) names(kevin) # Transform virus name variables so they're matchable. library(stringr) kevin$virusname <- str_replace_all(kevin$virusname, " ", "") select_first <- function(x) { str_split(x, "\\|")[[1]][1] } china$VirusName <- sapply(china$VirusName, select_first, USE.NAMES = FALSE) china$VirusName[china$VirusName == "theSARSRelatedCoronavirusHKU3"] <- "SARSRelatedCoronavirusHKU3" china$VirusName[china$VirusName == ""] <- NA # Merge the data frames china <- left_join(china, kevin, by = c("VirusName" = "virusname")) china$positive <- 0 china[!is.na(china$VirusName), "positive"] <- 1
viruses_per_animal <- china %>% group_by(AnimalID..GAINS.) %>% summarize(viruses_matched = length(unique(na.omit(VirusName)))) %>% group_by(viruses_matched) %>% summarize(individuals = length(viruses_matched)) viruses_per_animal$percent <- round(viruses_per_animal$individuals / sum(viruses_per_animal$individuals) * 100, 2) knitr::kable(viruses_per_animal, col.names = c("Viruses per animal", "Number of animals", "Percentage"), caption = "Individual animals by sum of viruses")
total_tested <- length(unique(china$AnimalID..GAINS.)) total_positive <- length(unique(china$AnimalID..GAINS.[china$positive > 0])) total_risk1 <- length(na.omit(unique(china$AnimalID..GAINS.[china$Risklevel == 1])))
r total_tested
r total_positive
r total_risk1
animals <- china %>% group_by(SpeciesScientificName) %>% summarize(SpeciesCommonNameEnglish = unique(SpeciesCommonNameEnglish), number = length(unique(AnimalID..GAINS.))) %>% arrange(desc(number)) names(animals) <- c("scientific", "common", "number") animals$common <- sapply(animals$common, function(x) strsplit(x, " within")[[1]][1]) knitr::kable(animals, col.names = c("Scientific name", "Common name", "Number tested"), caption = "Number of individuals tested per animal species")
PIG <- china %>% group_by(PrimaryInterfaceGroup) %>% summarize(number = length(unique(AnimalID..GAINS.))) %>% arrange(desc(number)) PIG$percent <- round(PIG$number / sum(PIG$number) * 100, 2) knitr::kable(PIG, col.names = c("Primary interface group", "Number of individuals", "Percentage"), caption = "Number of individuals tested per primary interface group")
SIG <- china %>% group_by(SecondaryInterfaceGroup) %>% summarize(number = length(unique(AnimalID..GAINS.))) %>% arrange(desc(number)) SIG$percent <- round(SIG$number / sum(SIG$number) * 100, 2) knitr::kable(SIG, col.names = c("Secondary interface group", "Number of individuals", "Percentage"), caption = "Number of individuals tested per secondary interface group")
china$risk1 <- 0 china$risk1[china$Risklevel == 1] <- 1 risk1_crosstab <- china %>% group_by(AnimalID..GAINS.) %>% filter(positive == 1) %>% summarize(risk1 = ifelse(mean(risk1) > 0, 1, 0), viruses_matched = length(unique(na.omit(VirusName)))) %>% group_by(viruses_matched) %>% summarize(individuals = length(viruses_matched), risk1 = sum(risk1)) risk1_crosstab$percent <- round(risk1_crosstab$risk1/risk1_crosstab$individuals * 100, 2) knitr::kable(risk1_crosstab, col.names = c("Viruses", "Individuals, total", "Individuals, 'high risk'", "Percentage"), caption = "Cross tabs of number of viruses in individuals by presence of risk level 1 virus")
riskfish <- risk1_crosstab[, 2:3] riskfish$other = riskfish$individuals - riskfish$risk1 riskfish$individuals <- NULL riskfishtest <- fisher.test(riskfish)
Just out of interest, I ran Fisher's exact test on the data. The p-value was $r (riskfishtest$p.value)
$.
risk1_viruses <- china %>% filter(Risklevel == 1) %>% group_by(VirusName) %>% summarize(animals = length(unique(na.omit(AnimalID..GAINS.)))) %>% select(VirusName, animals) %>% arrange(desc(animals)) knitr::kable(risk1_viruses, col.names = c("Virus name", "Number of animals"), caption = "'High risk' viruses from China GAINS data")
risk1_individual <- china %>% group_by(AnimalID..GAINS.) %>% summarize(scientific = unique(SpeciesScientificName), common = unique(SpeciesCommonNameEnglish), risk1 = ifelse(mean(risk1) > 0, 1, 0), PIG = unique(PrimaryInterfaceGroup), SIG = unique(SecondaryInterfaceGroup)) risk1_species <- risk1_individual %>% group_by(scientific) %>% summarize(common = unique(common), animals = length(unique(AnimalID..GAINS.)), risk1 = length(risk1[risk1 == 1]), percent = round(risk1 / animals * 100, 2)) %>% filter(risk1 > 0) %>% arrange(desc(percent)) risk1_species$common <- sapply(risk1_species$common, function(x) strsplit(x, " within")[[1]][1]) knitr::kable(risk1_species, col.names = c("Scientific name", "Common name", "Tested", "'High risk'", "%"), caption = "Number of animals with 'high risk' viruses per species")
risk1_PIG <- risk1_individual %>% group_by(PIG) %>% summarize(animals = length(unique(AnimalID..GAINS.)), risk1 = length(risk1[risk1 == 1]), percent = round(risk1 / animals * 100, 2)) %>% arrange(desc(percent)) knitr::kable(risk1_PIG, col.names = c("Primary interface group", "Tested", "'High risk'", "%"), caption = "Number of animals with 'high risk' viruses per primary interface group")
risk1_SIG <- risk1_individual %>% filter(!(SIG == "")) %>% group_by(SIG) %>% summarize(animals = length(unique(AnimalID..GAINS.)), risk1 = length(risk1[risk1 == 1]), percent = round(risk1 / animals * 100, 2)) %>% arrange(desc(percent)) knitr::kable(risk1_SIG, col.names = c("Secondary interface group", "Tested", "'High risk'", "%"), caption = "Number of animals with 'high risk' viruses per secondary interface group")
viruses <- table(china$VirusName[!(china$VirusName == "")]) viruses <- data.frame(viruses) viruses <- viruses[order(viruses$Freq, decreasing = TRUE), ] names(viruses) <- c("VirusName", "freq") viruses$VirusName <- as.character(viruses$VirusName) knitr::kable(head(viruses, 15), col.names = c("Virus name", "Frequency"), caption = ("Top 15 viruses from GAINS in China")) # For my own edification, another way of doing this. # viruses <- group_by(china, VirusName) # viruses <- summarize(viruses, freq = length(VirusName)) # viruses <- viruses[order(viruses$freq, decreasing = TRUE), ] # knitr::kable(viruses)
prev_by_risk <- china %>% # dplyr and magrittr group_by(Risklevel) %>% summarize(total = sum(positive), prevalence = (sum(positive) / nrow(china)) * 100) prev_by_risk$prevalence <- round(prev_by_risk$prevalence, 2) knitr::kable(prev_by_risk, col.names = c("Risk level", "Total", "Prevalence"), caption = "Sample prevalence of viruses by risk level")
A table of the number of total matches for each animal.
prev_by_animal <- china %>% # filter(positive > 0) %>% group_by(AnimalID..GAINS.) %>% summarize(total = sum(positive)) %>% arrange(desc(total)) prev_by_animal <- data.frame(table(prev_by_animal$total)) prev_by_animal$Var1 <- as.numeric(as.character(prev_by_animal$Var1)) prev_by_animal$Perc <- round(prev_by_animal$Freq / sum(prev_by_animal$Freq) * 100, 2) knitr::kable(prev_by_animal, col.names = c("Number of viruses", "Frequency in animals", "Percentage of animals"), caption = "Frequency of animals with 1, 2, and 3 matched viruses")
N.B. Kevin says that it's best to use the AVE_Manual
variable, as Risklevel
is really based on arbitrary cutoffs and was created for USAID; better to preserve the continuous nature of the ranking if we don't need to do stuff categorical.
There are r nrow(unique(china[, c("SpeciesScientificName", "SpeciesCommonNameEnglish")]))
species in the database.
The following table shows the highest risk level of virus found in each species.
by_species <- china %>% group_by(SpeciesScientificName) %>% summarize(highest_risklevel = min(na.omit(Risklevel))) %>% arrange(highest_risklevel) by_species$highest_risklevel[is.infinite(by_species$highest_risklevel)] <- NA knitr::kable(by_species, col.namse = c("Species", "Any match", "Highest risk level of positive match"), caption = "Summaries of virus matches by species")
Note: A risk level of NA
indicates no matches in that species.
I'm not actually sure how useful the next table will be, but I wanted to get a measure of virodiversity for tested species. The table shows the mean number of virus species found in individual animals (across all specimens, which I'm sure introduces bias—I'm sure Simon could tell us about that) for each species.
mean_viruses_by_species <- china %>% group_by(AnimalID..GAINS.) %>% summarize(num_species = length(unique(na.omit(VirusName))), SpeciesScientificName = SpeciesScientificName[1]) %>% group_by(SpeciesScientificName) %>% summarize(num_tested = length(AnimalID..GAINS.), mean_species = sum(num_species)/num_tested) %>% arrange(desc(mean_species)) %>% filter(mean_species > 0.0) knitr::kable(mean_viruses_by_species, col.names = c("Species", "Number of animals tested", "Mean virus species"), caption = "Mean number of virus species found per individual for different species")
And the same for taxonomic group:
mean_viruses_by_species <- china %>% group_by(AnimalID..GAINS.) %>% summarize(num_species = length(unique(na.omit(VirusName))), SpeciesScientificName = SpeciesScientificName[1]) %>% group_by(SpeciesScientificName) %>% summarize(num_tested = length(AnimalID..GAINS.), mean_species = sum(num_species)/num_tested) %>% arrange(desc(mean_species)) %>% filter(mean_species > 0.0) knitr::kable(mean_viruses_by_species, col.names = c("Species", "Number of animals tested", "Mean virus species"), caption = "Mean number of virus species found per individual for different species")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.